We are analysing data about complaints for the presence of cockroaches in 10 residential buildings. Using a Bayesian framework, we modelled the number of complaints with a Poisson model. Diagnostic analysis of the model highlighted some residual variability. We tried to improve our estimates by adding one predictor and an exposure term to the Poisson model, but the situation did not improve. The 'Poisson' assumption on the model variance turned out to be too restrictive for our data. The Negative Binomial model gave better results, but on examining the standardized residual plot we noticed that some residuals were still very large.
Perform a posterior predictive check, taking into account that the data are clustered by building.
# Load packages: model fitting (rstan), data manipulation (dplyr, lubridate)
# and plotting (ggplot2, bayesplot)
library(rstan)
library(dplyr)
library(lubridate)
library(ggplot2)
library(bayesplot)

theme_set(bayesplot::theme_default())
rstan_options(auto_write = TRUE)  # cache compiled Stan models on disk

# Read the pest-complaints data set and inspect its structure
pest_data <- readRDS('pest_data.RDS')
str(pest_data)
## 'data.frame': 120 obs. of 14 variables:
## $ mus : num 0.369 0.359 0.282 0.129 0.452 ...
## $ building_id : int 37 37 37 37 37 37 37 37 37 37 ...
## $ wk_ind : int 1 2 3 4 5 6 7 8 9 10 ...
## $ date : Date, format: "2017-01-15" "2017-02-14" ...
## $ traps : num 8 8 9 10 11 11 10 10 9 9 ...
## $ floors : num 8 8 8 8 8 8 8 8 8 8 ...
## $ sq_footage_p_floor : num 5149 5149 5149 5149 5149 ...
## $ live_in_super : num 0 0 0 0 0 0 0 0 0 0 ...
## $ monthly_average_rent: num 3847 3847 3847 3847 3847 ...
## $ average_tenant_age : num 53.9 53.9 53.9 53.9 53.9 ...
## $ age_of_building : num 47 47 47 47 47 47 47 47 47 47 ...
## $ total_sq_foot : num 41192 41192 41192 41192 41192 ...
## $ month : num 1 2 3 4 5 6 7 8 9 10 ...
## $ complaints : num 1 3 0 1 0 0 4 3 2 2 ...
summary(pest_data)
## mus building_id wk_ind date
## Min. :-0.1008 Min. : 5.0 Min. : 1.00 Min. :2017-01-15
## 1st Qu.: 0.5279 1st Qu.:26.0 1st Qu.: 3.75 1st Qu.:2017-04-07
## Median : 1.0422 Median :46.0 Median : 6.50 Median :2017-06-29
## Mean : 1.1116 Mean :49.6 Mean : 6.50 Mean :2017-06-29
## 3rd Qu.: 1.6837 3rd Qu.:70.0 3rd Qu.: 9.25 3rd Qu.:2017-09-19
## Max. : 2.8030 Max. :98.0 Max. :12.00 Max. :2017-12-11
## traps floors sq_footage_p_floor live_in_super
## Min. : 1.000 Min. : 4.0 Min. :4186 Min. :0.0
## 1st Qu.: 6.000 1st Qu.: 8.0 1st Qu.:4770 1st Qu.:0.0
## Median : 7.000 Median :10.0 Median :5097 Median :0.0
## Mean : 7.033 Mean : 9.9 Mean :4991 Mean :0.3
## 3rd Qu.: 8.000 3rd Qu.:13.0 3rd Qu.:5206 3rd Qu.:1.0
## Max. :11.000 Max. :15.0 Max. :5740 Max. :1.0
## monthly_average_rent average_tenant_age age_of_building total_sq_foot
## Min. :3029 Min. :41.14 Min. :39.0 Min. :19217
## 1st Qu.:3564 1st Qu.:45.14 1st Qu.:47.0 1st Qu.:41192
## Median :3813 Median :48.20 Median :49.0 Median :47096
## Mean :3687 Mean :49.92 Mean :49.4 Mean :49248
## 3rd Qu.:3864 3rd Qu.:53.88 3rd Qu.:51.0 3rd Qu.:59251
## Max. :4019 Max. :65.18 Max. :60.0 Max. :78093
## month complaints
## Min. : 1.00 Min. : 0.000
## 1st Qu.: 3.75 1st Qu.: 1.000
## Median : 6.50 Median : 2.000
## Mean : 6.50 Mean : 3.658
## 3rd Qu.: 9.25 3rd Qu.: 5.250
## Max. :12.00 Max. :18.000
## Number of distinct buildings in the data set
N_buildings <- pest_data$building_id %>%
  unique() %>%
  length()
N_buildings
## [1] 10
## Arrange the data into the list format expected by the Stan model.
## log_sq_foot (total square footage on the log scale, in units of 1e4
## sq ft) is used as the exposure term.
stan_dat_simple <- list(
  N = nrow(pest_data),
  complaints = pest_data$complaints,
  traps = pest_data$traps,
  log_sq_foot = log(pest_data$total_sq_foot / 1e4),
  live_in_super = pest_data$live_in_super
)
str(stan_dat_simple)
## List of 5
## $ N : int 120
## $ complaints : num [1:120] 1 3 0 1 0 0 4 3 2 2 ...
## $ traps : num [1:120] 8 8 9 10 11 11 10 10 9 9 ...
## $ log_sq_foot : num [1:120] 1.42 1.42 1.42 1.42 1.42 ...
## $ live_in_super: num [1:120] 0 0 0 0 0 0 0 0 0 0 ...
# Compile the multiple-predictor Negative Binomial Stan model and draw
# posterior samples with rstan defaults (4 chains, 2000 iterations each,
# first half warmup).
comp_model_NB <- stan_model('multiple_NB_regression.stan')
fitted_model_NB <- sampling(comp_model_NB, data = stan_dat_simple)
##
## SAMPLING FOR MODEL 'multiple_NB_regression' NOW (CHAIN 1).
## Chain 1:
## Chain 1: Gradient evaluation took 9.5e-05 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.95 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1:
## Chain 1:
## Chain 1: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 1: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 1: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 1: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 1: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 1:
## Chain 1: Elapsed Time: 0.881007 seconds (Warm-up)
## Chain 1: 0.716448 seconds (Sampling)
## Chain 1: 1.59746 seconds (Total)
## Chain 1:
##
## SAMPLING FOR MODEL 'multiple_NB_regression' NOW (CHAIN 2).
## Chain 2:
## Chain 2: Gradient evaluation took 5.7e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.57 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2:
## Chain 2:
## Chain 2: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 2: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 2: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 2: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 2: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 2:
## Chain 2: Elapsed Time: 0.857718 seconds (Warm-up)
## Chain 2: 0.691996 seconds (Sampling)
## Chain 2: 1.54971 seconds (Total)
## Chain 2:
##
## SAMPLING FOR MODEL 'multiple_NB_regression' NOW (CHAIN 3).
## Chain 3:
## Chain 3: Gradient evaluation took 5.7e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.57 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3:
## Chain 3:
## Chain 3: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 3: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 3: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 3: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 3: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 3:
## Chain 3: Elapsed Time: 0.787428 seconds (Warm-up)
## Chain 3: 0.728299 seconds (Sampling)
## Chain 3: 1.51573 seconds (Total)
## Chain 3:
##
## SAMPLING FOR MODEL 'multiple_NB_regression' NOW (CHAIN 4).
## Chain 4:
## Chain 4: Gradient evaluation took 3.6e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.36 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4:
## Chain 4:
## Chain 4: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 4: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 4: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 4: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 4: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 4: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 4: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 4: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 4: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 4: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 4: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 4: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 4:
## Chain 4: Elapsed Time: 0.887704 seconds (Warm-up)
## Chain 4: 0.816093 seconds (Sampling)
## Chain 4: 1.7038 seconds (Total)
## Chain 4:
# Pull posterior draws (including replicated data sets y_rep) out of the
# fitted NB model
samps_NB <- rstan::extract(fitted_model_NB)
y_rep <- samps_NB[["y_rep"]]

# Grouped posterior predictive check: compare the observed mean number of
# complaints per building against the replicated means
ppc_stat_grouped(
  y = stan_dat_simple$complaints,
  yrep = y_rep,
  group = pest_data$building_id,
  stat = 'mean',
  binwidth = 0.2
)
Select some of the variables containing information at the building level (live-in super, age of building, average tenant age, and monthly average rent) and arrange them into a matrix called building_data (nrow = number of buildings).
N_months <- length(unique(pest_data$date))
# Add integer IDs for building and month.
# NOTE(review): `ids` assumes the rows are ordered with N_months
# consecutive observations per building — confirm against the data.
pest_data <- pest_data %>%
  mutate(
    building_fac = factor(building_id, levels = unique(building_id)),
    building_idx = as.integer(building_fac),
    # seq_len() instead of 1:N_months: 1:n silently yields c(1, 0) when
    # n == 0, whereas seq_len(0) gives the correct empty sequence
    ids = rep(seq_len(N_months), N_buildings),
    mo_idx = lubridate::month(date)
  )
# Collapse to one row per building (the building-level predictors are
# constant within a building), ordered by the building index
building_level <- pest_data %>%
  select(
    building_idx,
    live_in_super,
    age_of_building,
    average_tenant_age,
    monthly_average_rent
  ) %>%
  unique() %>%
  arrange(building_idx) %>%
  select(-building_idx)

# Center each column, then divide by interpretable constants so the
# coefficients are on a convenient scale
building_data <- building_level %>%
  scale(scale = FALSE) %>%
  as.data.frame() %>%
  mutate(
    age_of_building = age_of_building / 10,
    average_tenant_age = average_tenant_age / 10,
    monthly_average_rent = monthly_average_rent / 1000
  ) %>%
  as.matrix()
str(building_data)
## num [1:10, 1:4] -0.3 -0.3 -0.3 -0.3 0.7 -0.3 -0.3 0.7 -0.3 0.7 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:4] "live_in_super" "age_of_building" "average_tenant_age" "monthly_average_rent"
# Assemble the data list for the hierarchical (varying intercept) model:
# J buildings, K building-level predictors, and per-observation indices
stan_dat_hier <- list(
  complaints = pest_data$complaints,
  traps = pest_data$traps,
  N = nrow(pest_data),
  J = N_buildings,
  log_sq_foot = log(pest_data$total_sq_foot / 1e4),
  building_data = building_data,
  mo_idx = as.integer(as.factor(pest_data$date)),
  K = 4,
  building_idx = pest_data$building_idx
)
str(stan_dat_hier)
## List of 9
## $ complaints : num [1:120] 1 3 0 1 0 0 4 3 2 2 ...
## $ traps : num [1:120] 8 8 9 10 11 11 10 10 9 9 ...
## $ N : int 120
## $ J : int 10
## $ log_sq_foot : num [1:120] 1.42 1.42 1.42 1.42 1.42 ...
## $ building_data: num [1:10, 1:4] -0.3 -0.3 -0.3 -0.3 0.7 -0.3 -0.3 0.7 -0.3 0.7 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr [1:4] "live_in_super" "age_of_building" "average_tenant_age" "monthly_average_rent"
## $ mo_idx : int [1:120] 1 2 3 4 5 6 7 8 9 10 ...
## $ K : num 4
## $ building_idx : int [1:120] 1 1 1 1 1 1 1 1 1 1 ...
Model the variation across buildings using a varying intercept Negative Binomial hierarchical model.
Using Stan, write, compile and fit the following varying intercept model
\[ \text{complaints}_{ib} \sim \text{Neg-Binomial}(\lambda_{ib}, \phi) \\ \lambda_{ib} = \exp{(\eta_{ib})} \\ \eta_{ib} = \mu_{b(i)} + \beta \, {\rm traps}_{i} + \text{log_sq_foot}_i \\ \mu_b \sim \text{Normal}(\alpha + \beta_{\rm super}\, {\rm super}_b + \ldots + \beta_{\rm mar}\, {\rm mar}_b , \sigma_{\mu}) \]
# Compile and fit the varying-intercept NB model; 4000 iterations per
# chain (2000 warmup) to give the hierarchical parameters room to mix
comp_model_NB_hier <- stan_model('hier_NB_regression.stan')
fitted_model_NB_hier <- sampling(
  comp_model_NB_hier,
  data = stan_dat_hier,
  chains = 4, #cores = 4,
  iter = 4000
)
##
## SAMPLING FOR MODEL 'hier_NB_regression' NOW (CHAIN 1).
## Chain 1:
## Chain 1: Gradient evaluation took 7.6e-05 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.76 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1:
## Chain 1:
## Chain 1: Iteration: 1 / 4000 [ 0%] (Warmup)
## Chain 1: Iteration: 400 / 4000 [ 10%] (Warmup)
## Chain 1: Iteration: 800 / 4000 [ 20%] (Warmup)
## Chain 1: Iteration: 1200 / 4000 [ 30%] (Warmup)
## Chain 1: Iteration: 1600 / 4000 [ 40%] (Warmup)
## Chain 1: Iteration: 2000 / 4000 [ 50%] (Warmup)
## Chain 1: Iteration: 2001 / 4000 [ 50%] (Sampling)
## Chain 1: Iteration: 2400 / 4000 [ 60%] (Sampling)
## Chain 1: Iteration: 2800 / 4000 [ 70%] (Sampling)
## Chain 1: Iteration: 3200 / 4000 [ 80%] (Sampling)
## Chain 1: Iteration: 3600 / 4000 [ 90%] (Sampling)
## Chain 1: Iteration: 4000 / 4000 [100%] (Sampling)
## Chain 1:
## Chain 1: Elapsed Time: 2.66079 seconds (Warm-up)
## Chain 1: 1.99953 seconds (Sampling)
## Chain 1: 4.66032 seconds (Total)
## Chain 1:
##
## SAMPLING FOR MODEL 'hier_NB_regression' NOW (CHAIN 2).
## Chain 2:
## Chain 2: Gradient evaluation took 4.9e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.49 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2:
## Chain 2:
## Chain 2: Iteration: 1 / 4000 [ 0%] (Warmup)
## Chain 2: Iteration: 400 / 4000 [ 10%] (Warmup)
## Chain 2: Iteration: 800 / 4000 [ 20%] (Warmup)
## Chain 2: Iteration: 1200 / 4000 [ 30%] (Warmup)
## Chain 2: Iteration: 1600 / 4000 [ 40%] (Warmup)
## Chain 2: Iteration: 2000 / 4000 [ 50%] (Warmup)
## Chain 2: Iteration: 2001 / 4000 [ 50%] (Sampling)
## Chain 2: Iteration: 2400 / 4000 [ 60%] (Sampling)
## Chain 2: Iteration: 2800 / 4000 [ 70%] (Sampling)
## Chain 2: Iteration: 3200 / 4000 [ 80%] (Sampling)
## Chain 2: Iteration: 3600 / 4000 [ 90%] (Sampling)
## Chain 2: Iteration: 4000 / 4000 [100%] (Sampling)
## Chain 2:
## Chain 2: Elapsed Time: 2.93057 seconds (Warm-up)
## Chain 2: 3.45877 seconds (Sampling)
## Chain 2: 6.38934 seconds (Total)
## Chain 2:
##
## SAMPLING FOR MODEL 'hier_NB_regression' NOW (CHAIN 3).
## Chain 3:
## Chain 3: Gradient evaluation took 4.5e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.45 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3:
## Chain 3:
## Chain 3: Iteration: 1 / 4000 [ 0%] (Warmup)
## Chain 3: Iteration: 400 / 4000 [ 10%] (Warmup)
## Chain 3: Iteration: 800 / 4000 [ 20%] (Warmup)
## Chain 3: Iteration: 1200 / 4000 [ 30%] (Warmup)
## Chain 3: Iteration: 1600 / 4000 [ 40%] (Warmup)
## Chain 3: Iteration: 2000 / 4000 [ 50%] (Warmup)
## Chain 3: Iteration: 2001 / 4000 [ 50%] (Sampling)
## Chain 3: Iteration: 2400 / 4000 [ 60%] (Sampling)
## Chain 3: Iteration: 2800 / 4000 [ 70%] (Sampling)
## Chain 3: Iteration: 3200 / 4000 [ 80%] (Sampling)
## Chain 3: Iteration: 3600 / 4000 [ 90%] (Sampling)
## Chain 3: Iteration: 4000 / 4000 [100%] (Sampling)
## Chain 3:
## Chain 3: Elapsed Time: 2.55586 seconds (Warm-up)
## Chain 3: 2.57818 seconds (Sampling)
## Chain 3: 5.13404 seconds (Total)
## Chain 3:
##
## SAMPLING FOR MODEL 'hier_NB_regression' NOW (CHAIN 4).
## Chain 4:
## Chain 4: Gradient evaluation took 6e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.6 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4:
## Chain 4:
## Chain 4: Iteration: 1 / 4000 [ 0%] (Warmup)
## Chain 4: Iteration: 400 / 4000 [ 10%] (Warmup)
## Chain 4: Iteration: 800 / 4000 [ 20%] (Warmup)
## Chain 4: Iteration: 1200 / 4000 [ 30%] (Warmup)
## Chain 4: Iteration: 1600 / 4000 [ 40%] (Warmup)
## Chain 4: Iteration: 2000 / 4000 [ 50%] (Warmup)
## Chain 4: Iteration: 2001 / 4000 [ 50%] (Sampling)
## Chain 4: Iteration: 2400 / 4000 [ 60%] (Sampling)
## Chain 4: Iteration: 2800 / 4000 [ 70%] (Sampling)
## Chain 4: Iteration: 3200 / 4000 [ 80%] (Sampling)
## Chain 4: Iteration: 3600 / 4000 [ 90%] (Sampling)
## Chain 4: Iteration: 4000 / 4000 [100%] (Sampling)
## Chain 4:
## Chain 4: Elapsed Time: 2.47614 seconds (Warm-up)
## Chain 4: 2.14333 seconds (Sampling)
## Chain 4: 4.61948 seconds (Total)
## Chain 4:
Do you have any divergent transitions?
# Extract posterior draws and summarise the main parameters; inspect
# n_eff and Rhat (and any divergence warnings) before trusting the fit
samps_hier_NB <- rstan::extract(fitted_model_NB_hier)
print(fitted_model_NB_hier, pars = c('sigma_mu','beta','alpha','phi','mu'))
## Inference for Stan model: hier_NB_regression.
## 4 chains, each with iter=4000; warmup=2000; thin=1;
## post-warmup draws per chain=2000, total post-warmup draws=8000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## sigma_mu 0.26 0.01 0.17 0.03 0.13 0.22 0.34 0.67 359 1.01
## beta -0.23 0.00 0.06 -0.35 -0.27 -0.23 -0.19 -0.11 978 1.00
## alpha 1.24 0.01 0.43 0.38 0.97 1.23 1.53 2.11 995 1.00
## phi 1.59 0.01 0.34 1.04 1.36 1.54 1.78 2.38 3705 1.00
## mu[1] 1.25 0.02 0.55 0.10 0.91 1.23 1.61 2.35 1060 1.00
## mu[2] 1.20 0.02 0.53 0.15 0.86 1.18 1.55 2.26 1011 1.01
## mu[3] 1.39 0.01 0.48 0.43 1.08 1.38 1.70 2.35 1039 1.00
## mu[4] 1.43 0.02 0.49 0.45 1.12 1.40 1.74 2.41 1046 1.00
## mu[5] 1.07 0.01 0.42 0.21 0.80 1.08 1.34 1.89 1298 1.00
## mu[6] 1.15 0.01 0.48 0.19 0.84 1.17 1.46 2.08 1138 1.00
## mu[7] 1.46 0.02 0.51 0.42 1.13 1.48 1.78 2.47 1124 1.00
## mu[8] 1.24 0.01 0.42 0.42 0.96 1.22 1.51 2.07 1117 1.00
## mu[9] 1.40 0.02 0.57 0.23 1.02 1.44 1.76 2.49 1110 1.00
## mu[10] 0.84 0.01 0.36 0.15 0.60 0.82 1.08 1.57 1279 1.00
##
## Samples were drawn using NUTS(diag_e) at Thu May 6 12:56:20 2021.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
# Trace of sigma_mu with sampler diagnostics overlaid, zoomed in on a
# window of iterations
mcmc_trace(
  as.array(fitted_model_NB_hier, pars = 'sigma_mu'),
  np = nuts_params(fitted_model_NB_hier),
  window = c(500, 1000)
)

# Scatter of mu[4] against log(sigma_mu) with divergent transitions
# marked; saved to an object so it can be compared with the
# non-centered fit later
scatter_with_divs <- mcmc_scatter(
  as.array(fitted_model_NB_hier),
  pars = c("mu[4]", "sigma_mu"),
  transform = list(sigma_mu = "log"),
  np = nuts_params(fitted_model_NB_hier)
)
scatter_with_divs
# Prior simulation illustrating the funnel geometry of a hierarchical
# model: theta | sigma ~ Normal(0, sigma) with log(sigma) ~ Normal(0, 1).
# rnorm() is vectorized, so the element-wise for-loop over N_sims scalar
# draws is unnecessary; draw all N_sims values at once.
N_sims <- 1000
log_sigma <- rnorm(N_sims, mean = 0, sd = 1)
theta <- rnorm(N_sims, mean = 0, sd = exp(log_sigma))
draws <- cbind("mu" = theta, "log(sigma_mu)" = log_sigma)
mcmc_scatter(draws)
# Parallel-coordinates plot of sigma_mu and the building intercepts mu,
# with divergent transitions highlighted; saved for later comparison
# with the non-centered fit
parcoord_with_divs <- mcmc_parcoord(
  as.array(fitted_model_NB_hier, pars = c("sigma_mu", "mu")),
  np = nuts_params(fitted_model_NB_hier)
)
parcoord_with_divs
We see evidence that our problems concentrate when \(\texttt{sigma_mu}\) is small. Reparametrize the model using the non-centered parametrization and check diagnostics. The distribution of \(\texttt{mu}\) does not change: \[ \mu_b \sim \text{Normal}(\alpha + \texttt{building_data} \, \zeta, \sigma_{\mu}) \] We add an auxiliary variable in the parameters block \(\texttt{mu_raw}\sim \text{Normal}(0, 1)\) and we move \(\texttt{mu}\) in the transformed parameters block.
transformed parameters {
// Non-centered parameterization: mu is a deterministic transform of the
// auxiliary variable mu_raw (declared in the parameters block with a
// standard-normal prior), so the sampler explores mu_raw instead of the
// funnel-shaped posterior of mu directly.
vector[J] mu;
// mu_b = alpha + building_data_b * zeta + sigma_mu * mu_raw_b
mu = alpha + building_data * zeta + sigma_mu * mu_raw;
}
# Compile and fit the non-centered parameterization of the
# varying-intercept model
comp_model_NB_hier_ncp <- stan_model('hier_NB_regression_ncp.stan')
fitted_model_NB_hier_ncp <- sampling(
  comp_model_NB_hier_ncp,
  data = stan_dat_hier, #cores = 4,
  chains = 4
)
##
## SAMPLING FOR MODEL 'hier_NB_regression_ncp' NOW (CHAIN 1).
## Chain 1:
## Chain 1: Gradient evaluation took 0.000122 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 1.22 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1:
## Chain 1:
## Chain 1: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 1: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 1: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 1: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 1: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 1:
## Chain 1: Elapsed Time: 1.8373 seconds (Warm-up)
## Chain 1: 1.05436 seconds (Sampling)
## Chain 1: 2.89166 seconds (Total)
## Chain 1:
##
## SAMPLING FOR MODEL 'hier_NB_regression_ncp' NOW (CHAIN 2).
## Chain 2:
## Chain 2: Gradient evaluation took 7.3e-05 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.73 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2:
## Chain 2:
## Chain 2: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 2: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 2: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 2: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 2: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 2:
## Chain 2: Elapsed Time: 1.76246 seconds (Warm-up)
## Chain 2: 1.12923 seconds (Sampling)
## Chain 2: 2.89168 seconds (Total)
## Chain 2:
##
## SAMPLING FOR MODEL 'hier_NB_regression_ncp' NOW (CHAIN 3).
## Chain 3:
## Chain 3: Gradient evaluation took 4.1e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.41 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3:
## Chain 3:
## Chain 3: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 3: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 3: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 3: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 3: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 3:
## Chain 3: Elapsed Time: 1.824 seconds (Warm-up)
## Chain 3: 0.992336 seconds (Sampling)
## Chain 3: 2.81634 seconds (Total)
## Chain 3:
##
## SAMPLING FOR MODEL 'hier_NB_regression_ncp' NOW (CHAIN 4).
## Chain 4:
## Chain 4: Gradient evaluation took 7.4e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.74 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4:
## Chain 4:
## Chain 4: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 4: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 4: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 4: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 4: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 4: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 4: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 4: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 4: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 4: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 4: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 4: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 4:
## Chain 4: Elapsed Time: 1.81885 seconds (Warm-up)
## Chain 4: 1.02889 seconds (Sampling)
## Chain 4: 2.84774 seconds (Total)
## Chain 4:
# Summarise the non-centered fit; compare n_eff and Rhat against the
# centered parameterization above
print(fitted_model_NB_hier_ncp, pars = c('sigma_mu','beta','alpha','phi','mu'))
## Inference for Stan model: hier_NB_regression_ncp.
## 4 chains, each with iter=2000; warmup=1000; thin=1;
## post-warmup draws per chain=1000, total post-warmup draws=4000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## sigma_mu 0.24 0.01 0.18 0.01 0.11 0.20 0.33 0.67 1156 1
## beta -0.23 0.00 0.06 -0.35 -0.27 -0.23 -0.19 -0.11 2983 1
## alpha 1.27 0.01 0.43 0.40 0.99 1.27 1.55 2.12 2996 1
## phi 1.58 0.00 0.34 1.03 1.33 1.54 1.78 2.36 5206 1
## mu[1] 1.29 0.01 0.55 0.18 0.92 1.30 1.66 2.33 3035 1
## mu[2] 1.23 0.01 0.54 0.15 0.89 1.25 1.60 2.24 3048 1
## mu[3] 1.40 0.01 0.48 0.44 1.09 1.41 1.73 2.36 3125 1
## mu[4] 1.44 0.01 0.49 0.46 1.11 1.44 1.77 2.41 3138 1
## mu[5] 1.09 0.01 0.42 0.26 0.81 1.09 1.36 1.90 3446 1
## mu[6] 1.19 0.01 0.48 0.23 0.87 1.20 1.51 2.10 3257 1
## mu[7] 1.47 0.01 0.52 0.42 1.11 1.48 1.82 2.47 3587 1
## mu[8] 1.26 0.01 0.43 0.43 0.97 1.26 1.54 2.11 3325 1
## mu[9] 1.43 0.01 0.55 0.32 1.07 1.45 1.79 2.49 3142 1
## mu[10] 0.87 0.01 0.37 0.16 0.62 0.87 1.11 1.60 3940 1
##
## Samples were drawn using NUTS(diag_e) at Thu May 6 12:57:33 2021.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
# Same mu[4] vs log(sigma_mu) scatter for the non-centered fit, placed
# side by side with the centered version for comparison
scatter_no_divs <- mcmc_scatter(
  as.array(fitted_model_NB_hier_ncp),
  pars = c("mu[4]", "sigma_mu"),
  transform = list(sigma_mu = "log"),
  np = nuts_params(fitted_model_NB_hier_ncp)
)
bayesplot_grid(
  scatter_with_divs, scatter_no_divs,
  grid_args = list(ncol = 2),
  ylim = c(-11, 1)
)
# Parallel-coordinates comparison between the centered and non-centered
# parameterizations
parcoord_no_divs <- mcmc_parcoord(
  as.array(fitted_model_NB_hier_ncp, pars = c("sigma_mu", "mu")),
  np = nuts_params(fitted_model_NB_hier_ncp)
)
bayesplot_grid(
  parcoord_with_divs, parcoord_no_divs,
  ylim = c(-3, 3)
)
# Extract replicated data and the dispersion parameter from the NCP fit
samps_NB_hier_ncp <- rstan::extract(fitted_model_NB_hier_ncp, pars = c('y_rep', 'inv_phi'))
y_rep <- as.matrix(fitted_model_NB_hier_ncp, pars = "y_rep")

# Overall density overlay using 200 replicated data sets
ppc_dens_overlay(stan_dat_hier$complaints, y_rep[1:200, ])

# Grouped PPCs: mean and sd of complaints within each building
ppc_stat_grouped(
  y = stan_dat_hier$complaints,
  yrep = y_rep,
  group = pest_data$building_id,
  stat = 'mean',
  binwidth = 0.5
)
ppc_stat_grouped(
  y = stan_dat_hier$complaints,
  yrep = y_rep,
  group = pest_data$building_id,
  stat = 'sd',
  binwidth = 0.5
)

# Posterior predictive intervals plotted against the number of traps
ppc_intervals(
  y = stan_dat_hier$complaints,
  yrep = y_rep,
  x = stan_dat_hier$traps
) +
  labs(x = "Number of traps", y = "Number of complaints")
# Standardized residuals against the posterior-mean replicated data,
# using the NB variance Var(y) = mu + mu^2 / phi (phi = 1 / inv_phi)
mean_y_rep <- colMeans(y_rep)
mean_inv_phi <- mean(as.matrix(fitted_model_NB_hier_ncp, pars = "inv_phi"))
std_resid <- (stan_dat_hier$complaints - mean_y_rep) /
  sqrt(mean_y_rep + mean_y_rep^2 * mean_inv_phi)
# qplot() is deprecated since ggplot2 3.4.0; build the plot with ggplot()
ggplot(
  data.frame(mean_y_rep = mean_y_rep, std_resid = std_resid),
  aes(x = mean_y_rep, y = std_resid)
) +
  geom_point() +
  hline_at(2) +
  hline_at(-2)
Try to expand the previous model by adding varying slopes.
\[ \text{complaints}_{ib} \sim \text{Neg-Binomial}(\lambda_{ib}, \phi) \\ \lambda_{ib} = \exp{(\eta_{ib})}\\ \eta_{ib} = \mu_{b(i)} + \kappa_{b(i)} \, \texttt{traps}_{ib} + \text{log_sq_foot}_i \\ \mu_b \sim \text{Normal}(\alpha + \texttt{building_data} \, \zeta, \sigma_{\mu}) \\ \kappa_b \sim \text{Normal}(\beta + \texttt{building_data} \, \gamma, \sigma_{\kappa}) \]
# Load the longer data set prepared for the varying-slopes model, then
# compile and fit it; adapt_delta is raised to reduce divergences
stan_dat_hier <- readRDS('pest_data_longer_stan_dat.RDS')
comp_model_NB_hier_slopes <- stan_model('hier_NB_regression_ncp_slopes_mod.stan')
fitted_model_NB_hier_slopes <- sampling(
  comp_model_NB_hier_slopes,
  data = stan_dat_hier,
  chains = 4, #cores = 4,
  control = list(adapt_delta = 0.95)
)
##
## SAMPLING FOR MODEL 'hier_NB_regression_ncp_slopes_mod' NOW (CHAIN 1).
## Chain 1:
## Chain 1: Gradient evaluation took 0.000144 seconds
## Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 1.44 seconds.
## Chain 1: Adjust your expectations accordingly!
## Chain 1:
## Chain 1:
## Chain 1: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 1: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 1: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 1: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 1: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 1: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 1:
## Chain 1: Elapsed Time: 17.9308 seconds (Warm-up)
## Chain 1: 12.5868 seconds (Sampling)
## Chain 1: 30.5176 seconds (Total)
## Chain 1:
##
## SAMPLING FOR MODEL 'hier_NB_regression_ncp_slopes_mod' NOW (CHAIN 2).
## Chain 2:
## Chain 2: Gradient evaluation took 0.000165 seconds
## Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 1.65 seconds.
## Chain 2: Adjust your expectations accordingly!
## Chain 2:
## Chain 2:
## Chain 2: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 2: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 2: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 2: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 2: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 2: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 2:
## Chain 2: Elapsed Time: 16.9769 seconds (Warm-up)
## Chain 2: 11.1863 seconds (Sampling)
## Chain 2: 28.1632 seconds (Total)
## Chain 2:
##
## SAMPLING FOR MODEL 'hier_NB_regression_ncp_slopes_mod' NOW (CHAIN 3).
## Chain 3:
## Chain 3: Gradient evaluation took 9.5e-05 seconds
## Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.95 seconds.
## Chain 3: Adjust your expectations accordingly!
## Chain 3:
## Chain 3:
## Chain 3: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 3: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 3: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 3: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 3: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 3: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 3: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 3: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 3: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 3: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 3: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 3: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 3:
## Chain 3: Elapsed Time: 16.6154 seconds (Warm-up)
## Chain 3: 16.034 seconds (Sampling)
## Chain 3: 32.6494 seconds (Total)
## Chain 3:
##
## SAMPLING FOR MODEL 'hier_NB_regression_ncp_slopes_mod' NOW (CHAIN 4).
## Chain 4:
## Chain 4: Gradient evaluation took 8.7e-05 seconds
## Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.87 seconds.
## Chain 4: Adjust your expectations accordingly!
## Chain 4:
## Chain 4:
## Chain 4: Iteration: 1 / 2000 [ 0%] (Warmup)
## Chain 4: Iteration: 200 / 2000 [ 10%] (Warmup)
## Chain 4: Iteration: 400 / 2000 [ 20%] (Warmup)
## Chain 4: Iteration: 600 / 2000 [ 30%] (Warmup)
## Chain 4: Iteration: 800 / 2000 [ 40%] (Warmup)
## Chain 4: Iteration: 1000 / 2000 [ 50%] (Warmup)
## Chain 4: Iteration: 1001 / 2000 [ 50%] (Sampling)
## Chain 4: Iteration: 1200 / 2000 [ 60%] (Sampling)
## Chain 4: Iteration: 1400 / 2000 [ 70%] (Sampling)
## Chain 4: Iteration: 1600 / 2000 [ 80%] (Sampling)
## Chain 4: Iteration: 1800 / 2000 [ 90%] (Sampling)
## Chain 4: Iteration: 2000 / 2000 [100%] (Sampling)
## Chain 4:
## Chain 4: Elapsed Time: 17.0269 seconds (Warm-up)
## Chain 4: 5.93226 seconds (Sampling)
## Chain 4: 22.9591 seconds (Total)
## Chain 4:
# Posterior histogram of sigma_kappa, the sd of the building-specific
# slopes
mcmc_hist(
  as.matrix(fitted_model_NB_hier_slopes, pars = "sigma_kappa"),
  binwidth = 0.005
)
# Summarise the varying-slopes fit: per-building slopes (kappa) and
# intercepts (mu) plus the population-level parameters
print(fitted_model_NB_hier_slopes, pars = c('kappa','beta','alpha','phi','sigma_mu','sigma_kappa','mu'))
## Inference for Stan model: hier_NB_regression_ncp_slopes_mod.
## 4 chains, each with iter=2000; warmup=1000; thin=1;
## post-warmup draws per chain=1000, total post-warmup draws=4000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## kappa[1] -0.02 0.00 0.08 -0.14 -0.07 -0.03 0.02 0.16 941 1
## kappa[2] -0.41 0.00 0.10 -0.62 -0.47 -0.41 -0.35 -0.24 1575 1
## kappa[3] -0.59 0.00 0.10 -0.79 -0.65 -0.59 -0.52 -0.40 4694 1
## kappa[4] -0.22 0.00 0.07 -0.36 -0.26 -0.22 -0.18 -0.09 2549 1
## kappa[5] -0.60 0.00 0.09 -0.78 -0.66 -0.60 -0.54 -0.42 3074 1
## kappa[6] -0.43 0.00 0.10 -0.65 -0.49 -0.43 -0.37 -0.24 1960 1
## kappa[7] -0.31 0.00 0.07 -0.44 -0.35 -0.31 -0.26 -0.18 5383 1
## kappa[8] -0.23 0.00 0.15 -0.55 -0.32 -0.22 -0.13 0.05 1979 1
## kappa[9] 0.08 0.00 0.06 -0.04 0.04 0.08 0.12 0.20 5326 1
## kappa[10] -0.72 0.00 0.16 -1.01 -0.82 -0.73 -0.63 -0.38 1196 1
## beta -0.34 0.00 0.06 -0.47 -0.38 -0.34 -0.31 -0.22 1797 1
## alpha 1.41 0.01 0.30 0.81 1.22 1.42 1.60 1.98 2303 1
## phi 1.60 0.00 0.19 1.26 1.47 1.59 1.72 2.00 3062 1
## sigma_mu 0.47 0.02 0.39 0.02 0.17 0.38 0.69 1.46 640 1
## sigma_kappa 0.12 0.00 0.08 0.02 0.07 0.10 0.15 0.36 636 1
## mu[1] 0.30 0.02 0.72 -1.38 -0.08 0.41 0.78 1.42 924 1
## mu[2] 1.64 0.01 0.51 0.73 1.28 1.59 1.95 2.76 1506 1
## mu[3] 2.13 0.00 0.32 1.54 1.92 2.13 2.33 2.79 4965 1
## mu[4] 1.47 0.01 0.49 0.47 1.16 1.47 1.79 2.50 2947 1
## mu[5] 2.39 0.01 0.42 1.58 2.10 2.39 2.67 3.22 3497 1
## mu[6] 1.89 0.01 0.39 1.18 1.65 1.87 2.12 2.71 1967 1
## mu[7] 2.68 0.00 0.26 2.20 2.51 2.68 2.85 3.20 4941 1
## mu[8] -0.55 0.02 0.96 -2.40 -1.18 -0.55 0.06 1.48 2095 1
## mu[9] 0.21 0.01 0.59 -0.92 -0.18 0.19 0.61 1.36 5578 1
## mu[10] 1.85 0.04 1.07 -0.70 1.28 1.99 2.59 3.60 907 1
##
## Samples were drawn using NUTS(diag_e) at Thu May 6 13:00:34 2021.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
# Posterior histogram of the population-level slope beta
mcmc_hist(
  as.matrix(fitted_model_NB_hier_slopes, pars = "beta"),
  binwidth = 0.005
)
# Density overlay PPC for the varying-slopes model (200 replications)
y_rep <- as.matrix(fitted_model_NB_hier_slopes, pars = "y_rep")
ppc_dens_overlay(
  y = stan_dat_hier$complaints,
  yrep = y_rep[1:200,]
)